# -*- coding: utf-8 -*-

import os


# Root directory of the competition data set.
DATA_DIR = "ccks_7_1_competition_data"
# Sub-directory names inside DATA_DIR; the literals are Chinese for
# "training set" and "validation set" respectively.
TRAINING_DIR = "训练集"
TESTING_DIR = "验证集"
# File names for the labelled entity types and the test entities.
training_file = "entity_type.txt"
testing_file = "test.txt"
# The entity pages are split over four numbered XML files (1 through 4).
pages_file = ["entity_pages_%d.xml" % part for part in range(1, 5)]


# Special tokens: ids 0 and 1 are reserved for padding and unknown characters.
UNK, PAD = "<UNK>", "<PAD>"
# Character -> integer id.
char_vocabs = {PAD: 0, UNK: 1}
# Inverse mapping: id2char[i] is the entry whose id is i.
id2char = [PAD, UNK]
# Number of distinct characters collected so far (special tokens excluded).
count = 0

# Running total of lines read across all page files (drives the progress line).
lines = 0
for page_file in pages_file:
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, which differs between machines and can fail on or mis-decode
    # non-ASCII text.
    # NOTE(review): assumes the page files are UTF-8 encoded - confirm.
    with open(os.path.join(DATA_DIR, TRAINING_DIR, page_file), 'r', encoding='utf-8') as f:
        for line in f:
            # Every character of the line is considered, including the
            # trailing line break.
            for c in line:
                if c not in char_vocabs:
                    # The next free id is always the current length of
                    # id2char, which keeps both mappings in step.
                    char_vocabs[c] = len(id2char)
                    id2char.append(c)
                    count += 1
            lines += 1
            # "\r" rewinds to the start of the terminal line so the counter
            # is updated in place.
            print("\rprocessed %d lines" % lines, end="")
print(" done. ")


# Destination file for the vocabulary: one entry per line, in id order.
out_vocabs_file = "char_vocabs.txt"

# Encode explicitly as UTF-8: with the locale's default encoding the write can
# raise UnicodeEncodeError for characters that encoding cannot represent, and
# the resulting file would not be portable between machines.
with open(out_vocabs_file, 'w', encoding='utf-8') as f:
    # NOTE(review): if id2char contains a line-break character, that entry is
    # written as an empty line followed by a second line break, so a reader
    # that maps line number -> id would be shifted from that point on. Verify
    # against whatever code loads this file before changing the format.
    f.writelines(c + '\n' for c in id2char)
print("Write to %s " % out_vocabs_file)
